In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from prophet import Prophet
import geopandas as gpd
import plotly.express as px
# Read the cleaned posts into a DataFrame.
df = pd.read_json("cleaned_data.json")

### 📊 USER ENGAGEMENT ANALYSIS ###
# Engagement is the post score plus the comment count, with comments counted double.
df["engagement_score"] = df["score"].add(df["num_comments"].mul(2))

# Histogram (with a KDE overlay) of the engagement score across all posts.
plt.figure(figsize=(10, 5))
engagement_ax = sns.histplot(data=df, x="engagement_score", bins=50, kde=True)
engagement_ax.set_title("User Engagement Score Distribution")
engagement_ax.set_xlabel("Engagement Score")
engagement_ax.set_ylabel("Frequency")
plt.show()
### 🌍 LOCATION ANALYSIS ###
# Posts without a location are grouped under an explicit "Unknown" label.
df["location"] = df["location"].fillna("Unknown")

# One row per location with its post count, most frequent first.
location_counts = (
    df["location"]
    .value_counts()
    .rename_axis("Location")
    .reset_index(name="Count")
)

fig = px.bar(
    data_frame=location_counts,
    x="Location",
    y="Count",
    title="Post Count by Location",
)
fig.show()
# Optional: visualize the post counts on a map.
# This needs a boundary file (e.g. a GeoJSON of regions) whose region-name column matches the values in df["location"].
# Example: Indian states map (uncomment the lines below if applicable)
# india_map = gpd.read_file("india_states.geojson") # Load India map GeoJSON
# india_map = india_map.merge(location_counts, left_on="state_name", right_on="Location", how="left")
# india_map.plot(column="Count", cmap="Blues", legend=True)
# plt.title("Engagement by State")
# plt.show()
### 📈 TREND ANALYSIS ###
# Parse the creation timestamps so the .dt accessor can be used below.
# NOTE(review): if created_utc holds Unix epoch seconds, pd.to_datetime needs unit="s" — confirm against the data.
df["created_utc"] = pd.to_datetime(df["created_utc"])

# Total score per calendar day, shaped as the two-column ds/y frame handed to Prophet.
time_series = (
    df.groupby(df["created_utc"].dt.date)["score"]
    .sum()
    .rename_axis("ds")
    .reset_index(name="y")
)

# Fit Prophet on the daily totals and extend the frame 30 days past the last observed date.
model = Prophet()
model.fit(time_series)
future = model.make_future_dataframe(periods=30)
forecast = model.predict(future)

# Plot the forecast and title the axes of the figure Prophet returns.
fig = model.plot(forecast)
fig.gca().set_title("Trend Analysis of Post Scores Over Time")
plt.show()
02:44:29 - cmdstanpy - INFO - Chain [1] start processing 02:44:30 - cmdstanpy - INFO - Chain [1] done processing
In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Read the cleaned posts into a DataFrame.
df = pd.read_json("cleaned_data.json")

# Fail fast: the sentiment steps in this cell all read the raw post text.
if "text" not in df:
    raise ValueError("Dataset must contain a 'text' column.")
### 🤖 SENTIMENT ANALYSIS USING TRANSFORMERS ###
# No model is named here, so the pipeline falls back to its default sentiment checkpoint.
sentiment_model = pipeline("sentiment-analysis")

# Score every post in a single pipeline call instead of one call per row.
# truncation=True clips posts that exceed the model's maximum sequence length;
# without it, a single over-long post raises and aborts the whole column.
texts = df["text"].tolist()
# Skip the model call entirely for an empty frame.
predictions = sentiment_model(texts, truncation=True) if texts else []
df["huggingface_sentiment"] = [prediction["label"] for prediction in predictions]
### 🔥 SENTIMENT ANALYSIS USING VADER (Lexicon-based) ###
# Build the analyzer once at module level; vader_sentiment() below reads this
# shared instance each time it is called.
vader = SentimentIntensityAnalyzer()
def vader_sentiment(text):
    """Classify ``text`` as POSITIVE, NEGATIVE or NEUTRAL from its VADER compound score.

    Args:
        text: Post text. Missing values (None/NaN), other non-string values and
            blank strings are labelled "NEUTRAL" without calling the analyzer.

    Returns:
        "POSITIVE" when the compound score is >= 0.05, "NEGATIVE" when it is
        <= -0.05, and "NEUTRAL" otherwise.
    """
    # Missing text arrives as None/NaN when this is applied over a DataFrame column;
    # there is nothing to score, so label it neutral instead of letting the analyzer raise.
    if not isinstance(text, str) or not text.strip():
        return "NEUTRAL"

    score = vader.polarity_scores(text)["compound"]
    if score >= 0.05:
        return "POSITIVE"
    elif score <= -0.05:
        return "NEGATIVE"
    else:
        return "NEUTRAL"
# Label every post with its VADER sentiment class.
df["vader_sentiment"] = df["text"].map(vader_sentiment)
### 📊 SENTIMENT DISTRIBUTION ###
# One count plot per model: (label column, colour palette, figure title).
for label_column, palette_name, plot_title in (
    ("huggingface_sentiment", "coolwarm", "Sentiment Analysis (Hugging Face)"),
    ("vader_sentiment", "viridis", "Sentiment Analysis (VADER)"),
):
    plt.figure(figsize=(10, 5))
    sns.countplot(x=df[label_column], palette=palette_name)
    plt.title(plot_title)
    plt.show()
### 🔍 PRINT SAMPLE RESULTS ###
# First ten posts with both models' labels side by side for a quick sanity check.
print(df.head(10)[["text", "huggingface_sentiment", "vader_sentiment"]])
Positive
In [15]:
### 🌍 LOCATION ANALYSIS ###
# Posts without a location are grouped under an explicit "Unknown" label.
df["location"] = df["location"].fillna("Unknown")

# Split the posts by whether their location is known.
known_locations = df.loc[df["location"].ne("Unknown")]
unknown_locations = df.loc[df["location"].eq("Unknown")]

# Post count per known location, most frequent first.
location_counts = (
    known_locations["location"]
    .value_counts()
    .rename_axis("Location")
    .reset_index(name="Count")
)

# Append "Unknown" as the final bar so it never displaces a real location in the ranking.
unknown_count = len(unknown_locations)
if unknown_count > 0:
    location_counts = pd.concat(
        [
            location_counts,
            pd.DataFrame({"Location": ["Unknown"], "Count": [unknown_count]}),
        ],
        ignore_index=True,
    )

# Bar chart with each bar's count printed above it.
fig = px.bar(
    data_frame=location_counts,
    x="Location",
    y="Count",
    title="Post Count by Location",
    text="Count",
)
fig.update_traces(textposition="outside")
fig.show()
In [16]:
### 🌍 CATEGORY VS LOCATION ANALYSIS ###
# Give missing values explicit labels so those posts still appear in the grouped counts.
df["location"] = df["location"].fillna("Unknown")
df["category"] = df["category"].fillna("Uncategorized")

# Number of posts for every (location, category) pair that occurs in the data.
category_location_counts = (
    df.groupby(["location", "category"])
    .size()
    .rename("Count")
    .reset_index()
)

# Stacked bars: one bar per location, one coloured segment per category.
fig = px.bar(
    data_frame=category_location_counts,
    x="location",
    y="Count",
    color="category",
    text="Count",
    barmode="stack",
    title="Category Distribution by Location",
)
fig.update_traces(textposition="outside")
fig.show()
In [ ]: